In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

train_beneficiary = pd.read_csv("/content/Train_Beneficiarydata-1542865627584.csv")
train_inpatient = pd.read_csv("/content/Train_Inpatientdata-1542865627584.csv")
train_outpatient = pd.read_csv("/content/Train_Outpatientdata-1542865627584.csv")
train_provider = pd.read_csv("/content/Train-1542865627584.csv") # Assuming this is the provider labels file

unseen_beneficiary = pd.read_csv("/content/Unseen_Beneficiarydata-1542969243754.csv")
unseen_inpatient = pd.read_csv("/content/Unseen_Inpatientdata-1542969243754.csv")
unseen_outpatient = pd.read_csv("/content/Unseen_Outpatientdata-1542969243754.csv")
unseen_providers = pd.read_csv("/content/Unseen-1542969243754.csv")

print("Training Beneficiary Data Shape:", train_beneficiary.shape)
print("Training Inpatient Data Shape:", train_inpatient.shape)
print("Training Outpatient Data Shape:", train_outpatient.shape)
print("Training Provider Data Shape:", train_provider.shape)
print("Unseen Beneficiary Data Shape:", unseen_beneficiary.shape)
print("Unseen Inpatient Data Shape:", unseen_inpatient.shape)
print("Unseen Outpatient Data Shape:", unseen_outpatient.shape)
print("Unseen Providers Data Shape:", unseen_providers.shape)
Training Beneficiary Data Shape: (138556, 25)
Training Inpatient Data Shape: (40474, 30)
Training Outpatient Data Shape: (517737, 27)
Training Provider Data Shape: (5410, 2)
Unseen Beneficiary Data Shape: (63968, 25)
Unseen Inpatient Data Shape: (9551, 30)
Unseen Outpatient Data Shape: (125841, 27)
Unseen Providers Data Shape: (1353, 1)
In [20]:
train_claims = pd.concat([train_inpatient, train_outpatient], ignore_index=True)

train_merged = pd.merge(train_claims, train_beneficiary, on='BeneID', how='left')

train_df = pd.merge(train_merged, train_provider, on='Provider', how='left')

print("Merged Training Data Shape:", train_df.shape)
print(train_df.head())
print("\nValue counts of 'PotentialFraud':")
print(train_df['PotentialFraud'].value_counts())
Merged Training Data Shape: (558211, 55)
      BeneID   ClaimID ClaimStartDt  ClaimEndDt  Provider  \
0  BENE11001  CLM46614   2009-04-12  2009-04-18  PRV55912   
1  BENE11001  CLM66048   2009-08-31  2009-09-02  PRV55907   
2  BENE11001  CLM68358   2009-09-17  2009-09-20  PRV56046   
3  BENE11011  CLM38412   2009-02-14  2009-02-22  PRV52405   
4  BENE11014  CLM63689   2009-08-13  2009-08-30  PRV56614   

   InscClaimAmtReimbursed AttendingPhysician OperatingPhysician  \
0                   26000          PHY390922                NaN   
1                    5000          PHY318495          PHY318495   
2                    5000          PHY372395                NaN   
3                    5000          PHY369659          PHY392961   
4                   10000          PHY379376          PHY398258   

  OtherPhysician AdmissionDt  ... ChronicCond_Diabetes  \
0            NaN  2009-04-12  ...                    1   
1            NaN  2009-08-31  ...                    1   
2      PHY324689  2009-09-17  ...                    1   
3      PHY349768  2009-02-14  ...                    1   
4            NaN  2009-08-13  ...                    2   

   ChronicCond_IschemicHeart ChronicCond_Osteoporasis  \
0                          1                        2   
1                          1                        2   
2                          1                        2   
3                          2                        2   
4                          1                        2   

  ChronicCond_rheumatoidarthritis ChronicCond_stroke IPAnnualReimbursementAmt  \
0                               1                  1                    36000   
1                               1                  1                    36000   
2                               1                  1                    36000   
3                               1                  1                     5000   
4                               2                  2                    21260   

  IPAnnualDeductibleAmt OPAnnualReimbursementAmt OPAnnualDeductibleAmt  \
0                  3204                       60                    70   
1                  3204                       60                    70   
2                  3204                       60                    70   
3                  1068                      250                   320   
4                  2136                      120                   100   

  PotentialFraud  
0            Yes  
1             No  
2             No  
3             No  
4             No  

[5 rows x 55 columns]

Value counts of 'PotentialFraud':
PotentialFraud
No     345415
Yes    212796
Name: count, dtype: int64
In [21]:
plt.figure(figsize=(6, 4))
sns.countplot(data=train_df, x='PotentialFraud')
plt.title('Distribution of Potential Fraud Cases')
plt.xlabel('PotentialFraud')
plt.ylabel('Number of Providers')
plt.show()

print("\nProportion of Potential Fraud Cases:")
print(train_df['PotentialFraud'].value_counts(normalize=True))
No description has been provided for this image
Proportion of Potential Fraud Cases:
PotentialFraud
No     0.618789
Yes    0.381211
Name: proportion, dtype: float64
In [22]:
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Remove identifier columns and potentially some date-related or aggregated amounts
numerical_cols = [col for col in numerical_cols if col not in ['BeneID', 'ClaimID', 'Provider', 'ClaimStartDt', 'ClaimEndDt',
                                                               'AdmissionDt', 'DischargeDt', 'DOB', 'DOD']]

for col in numerical_cols:
    if col in train_df.columns:
        # 1. Check data type and convert if needed.. Basically we'll perform EDA
        if train_df[col].dtype not in ['int64', 'float64']:
            try:
                train_df[col] = pd.to_numeric(train_df[col], errors='coerce')  # Convert to numeric, NaNs for failures
                print(f"Converted column '{col}' to numeric.")
            except ValueError:
                print(f"Column '{col}' cannot be converted to numeric. Skipping boxplot.")
                continue  # Skip to the next column


        unique_value_count = train_df[col].nunique()
        non_nan_count = train_df[col].count()

        if unique_value_count < 2 or non_nan_count == 0:
            print(f"Column '{col}' has too few unique values or all NaNs. Skipping boxplot.")
            continue

        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        sns.histplot(train_df[col].dropna(), kde=True)
        plt.title(f'Distribution of {col}')

        plt.subplot(1, 2, 2)
        sns.boxplot(x='PotentialFraud', y=col, data=train_df)
        plt.title(f'{col} by Potential Fraud')
        plt.tight_layout()
        plt.show()

        print(f"\nDescriptive Statistics for {col} by Potential Fraud:")
        print(train_df.groupby('PotentialFraud')[col].describe())
    else:
        print(f"Column '{col}' not found in train_df.")
No description has been provided for this image
Descriptive Statistics for InscClaimAmtReimbursed by Potential Fraud:
                   count         mean          std  min   25%   50%    75%  \
PotentialFraud                                                               
No              345415.0   755.213352  3056.460166  0.0  40.0  80.0  300.0   
Yes             212796.0  1389.505066  4785.074685  0.0  40.0  90.0  400.0   

                     max  
PotentialFraud            
No              125000.0  
Yes             125000.0  
No description has been provided for this image
Descriptive Statistics for DeductibleAmtPaid by Potential Fraud:
                   count        mean         std  min  25%  50%  75%     max
PotentialFraud                                                              
No              345024.0   54.282430  228.997887  0.0  0.0  0.0  0.0  1068.0
Yes             212288.0  117.652769  330.762589  0.0  0.0  0.0  0.0  1068.0
No description has been provided for this image
Descriptive Statistics for ClmProcedureCode_1 by Potential Fraud:
                  count         mean          std   min     25%     50%  \
PotentialFraud                                                            
No               9786.0  5866.282240  3060.127143  13.0  3818.0  5188.0   
Yes             13524.0  5917.770334  3043.427785  11.0  3891.0  5451.0   

                   75%     max  
PotentialFraud                  
No              8674.0  9999.0  
Yes             8659.0  9999.0  
No description has been provided for this image
Descriptive Statistics for ClmProcedureCode_2 by Potential Fraud:
                 count         mean          std   min     25%     50%  \
PotentialFraud                                                           
No              2244.0  4132.491979  2019.352638  42.0  2752.0  4019.0   
Yes             3246.0  4088.291436  2040.207424  42.0  2724.0  4019.0   

                   75%     max  
PotentialFraud                  
No              4439.0  9999.0  
Yes             4439.0  9999.0  
No description has been provided for this image
Descriptive Statistics for ClmProcedureCode_3 by Potential Fraud:
                count         mean          std    min     25%     50%  \
PotentialFraud                                                           
No              406.0  4176.603448  2298.686770  185.0  2724.0  4019.0   
Yes             563.0  4253.229130  2271.136731   42.0  2724.0  4019.0   

                   75%     max  
PotentialFraud                  
No              4547.0  9982.0  
Yes             5234.0  9999.0  
No description has been provided for this image
Descriptive Statistics for ClmProcedureCode_4 by Potential Fraud:
                count         mean          std    min      25%     50%  \
PotentialFraud                                                            
No               42.0  4165.285714  2382.664877   42.0  2730.25  4019.0   
Yes              76.0  4017.750000  1834.577155  185.0  2761.75  4019.0   

                   75%     max  
PotentialFraud                  
No              4998.5  9986.0  
Yes             4439.0  9974.0  
No description has been provided for this image
Descriptive Statistics for ClmProcedureCode_5 by Potential Fraud:
                count         mean          std     min     25%     50%  \
PotentialFraud                                                            
No                3.0  3761.666667   909.717722  2724.0  3431.5  4139.0   
Yes               6.0  6023.333333  3160.536073  2724.0  4139.0  4662.0   

                   75%     max  
PotentialFraud                  
No              4280.5  4422.0  
Yes             8774.5  9982.0  
Column 'ClmProcedureCode_6' has too few unique values or all NaNs. Skipping boxplot.
No description has been provided for this image
Descriptive Statistics for Gender by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.579017  0.493718  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.578549  0.493793  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for Race by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.241223  0.689281  1.0  1.0  1.0  1.0  5.0
Yes             212796.0  1.277392  0.760395  1.0  1.0  1.0  1.0  5.0
No description has been provided for this image
Descriptive Statistics for State by Potential Fraud:
                   count       mean        std  min   25%   50%   75%   max
PotentialFraud                                                             
No              345415.0  25.946276  15.085346  1.0  13.0  25.0  38.0  54.0
Yes             212796.0  24.636483  15.331032  1.0  10.0  23.0  37.0  54.0
No description has been provided for this image
Descriptive Statistics for County by Potential Fraud:
                   count        mean         std  min    25%    50%    75%  \
PotentialFraud                                                               
No              345415.0  376.183579  271.528928  0.0  140.0  340.0  570.0   
Yes             212796.0  382.491419  254.586720  0.0  160.0  360.0  570.0   

                  max  
PotentialFraud         
No              999.0  
Yes             999.0  
No description has been provided for this image
Descriptive Statistics for NoOfMonths_PartACov by Potential Fraud:
                   count       mean       std  min   25%   50%   75%   max
PotentialFraud                                                            
No              345415.0  11.927965  0.911568  0.0  12.0  12.0  12.0  12.0
Yes             212796.0  11.937165  0.853015  0.0  12.0  12.0  12.0  12.0
No description has been provided for this image
Descriptive Statistics for NoOfMonths_PartBCov by Potential Fraud:
                   count       mean       std  min   25%   50%   75%   max
PotentialFraud                                                            
No              345415.0  11.937562  0.795000  0.0  12.0  12.0  12.0  12.0
Yes             212796.0  11.940732  0.770898  0.0  12.0  12.0  12.0  12.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_Alzheimer by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.600538  0.489788  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.594226  0.491042  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_Heartfailure by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.413063  0.492385  1.0  1.0  1.0  2.0  2.0
Yes             212796.0  1.403908  0.490681  1.0  1.0  1.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_KidneyDisease by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.594294  0.491029  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.577779  0.493915  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_Cancer by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.849668  0.357397  1.0  2.0  2.0  2.0  2.0
Yes             212796.0  1.846905  0.360080  1.0  2.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_ObstrPulmonary by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.691464  0.461890  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.679938  0.466501  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_Depression by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.565375  0.495708  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.564898  0.495772  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_Diabetes by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.296516  0.456722  1.0  1.0  1.0  2.0  2.0
Yes             212796.0  1.291505  0.454456  1.0  1.0  1.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_IschemicHeart by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.243467  0.429175  1.0  1.0  1.0  1.0  2.0
Yes             212796.0  1.236301  0.424810  1.0  1.0  1.0  1.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_Osteoporasis by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.682784  0.465393  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.681653  0.465836  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_rheumatoidarthritis by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.690645  0.462229  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.685882  0.464165  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_stroke by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.900453  0.299395  1.0  2.0  2.0  2.0  2.0
Yes             212796.0  1.894754  0.306871  1.0  2.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for IPAnnualReimbursementAmt by Potential Fraud:
                   count         mean           std     min  25%  50%     75%  \
PotentialFraud                                                                  
No              345415.0  4903.940535  11388.465739 -8000.0  0.0  0.0  5000.0   
Yes             212796.0  5753.945375  12386.869727  -500.0  0.0  0.0  7000.0   

                     max  
PotentialFraud            
No              155600.0  
Yes             161470.0  
No description has been provided for this image
Descriptive Statistics for IPAnnualDeductibleAmt by Potential Fraud:
                   count        mean          std  min  25%  50%     75%  \
PotentialFraud                                                             
No              345415.0  534.962431  1137.325961  0.0  0.0  0.0  1068.0   
Yes             212796.0  623.612559  1242.152668  0.0  0.0  0.0  1068.0   

                    max  
PotentialFraud           
No              38272.0  
Yes             38272.0  
No description has been provided for this image
Descriptive Statistics for OPAnnualReimbursementAmt by Potential Fraud:
                   count         mean          std   min    25%     50%  \
PotentialFraud                                                            
No              345415.0  2271.897486  3834.023483 -50.0  470.0  1170.0   
Yes             212796.0  2288.496870  3958.231143 -70.0  460.0  1160.0   

                   75%       max  
PotentialFraud                    
No              2600.0  102960.0  
Yes             2570.0  102960.0  
No description has been provided for this image
Descriptive Statistics for OPAnnualDeductibleAmt by Potential Fraud:
                   count        mean          std  min    25%    50%    75%  \
PotentialFraud                                                                
No              345415.0  647.402655   990.644469  0.0  120.0  340.0  790.0   
Yes             212796.0  653.425807  1020.208428  0.0  120.0  340.0  800.0   

                    max  
PotentialFraud           
No              13840.0  
Yes             13840.0  
In [23]:
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

numerical_cols = [col for col in numerical_cols if col not in ['BeneID', 'ClaimID', 'Provider', 'ClaimStartDt', 'ClaimEndDt',
                                                               'AdmissionDt', 'DischargeDt', 'DOB', 'DOD']]

for col in numerical_cols:
    if col in train_df.columns:

        if train_df[col].dtype not in ['int64', 'float64']:
            try:
                train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
                print(f"Converted column '{col}' to numeric.")
            except ValueError:
                print(f"Column '{col}' cannot be converted to numeric. Skipping boxplot.")
                continue

        #Check for too few unique values or all NaNs
        unique_value_count = train_df[col].nunique()
        non_nan_count = train_df[col].count()

        if unique_value_count < 2 or non_nan_count == 0:
            print(f"Column '{col}' has too few unique values or all NaNs. Skipping boxplot.")
            continue

        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        sns.histplot(train_df[col].dropna(), kde=True)
        plt.title(f'Distribution of {col}')

        plt.subplot(1, 2, 2)
        sns.boxplot(x='PotentialFraud', y=col, data=train_df)
        plt.title(f'{col} by Potential Fraud')
        plt.tight_layout()
        plt.show()

        print(f"\nDescriptive Statistics for {col} by Potential Fraud:")
        print(train_df.groupby('PotentialFraud')[col].describe())
    else:
        print(f"Column '{col}' not found in train_df.")
No description has been provided for this image
Descriptive Statistics for InscClaimAmtReimbursed by Potential Fraud:
                   count         mean          std  min   25%   50%    75%  \
PotentialFraud                                                               
No              345415.0   755.213352  3056.460166  0.0  40.0  80.0  300.0   
Yes             212796.0  1389.505066  4785.074685  0.0  40.0  90.0  400.0   

                     max  
PotentialFraud            
No              125000.0  
Yes             125000.0  
No description has been provided for this image
Descriptive Statistics for DeductibleAmtPaid by Potential Fraud:
                   count        mean         std  min  25%  50%  75%     max
PotentialFraud                                                              
No              345024.0   54.282430  228.997887  0.0  0.0  0.0  0.0  1068.0
Yes             212288.0  117.652769  330.762589  0.0  0.0  0.0  0.0  1068.0
No description has been provided for this image
Descriptive Statistics for ClmProcedureCode_1 by Potential Fraud:
                  count         mean          std   min     25%     50%  \
PotentialFraud                                                            
No               9786.0  5866.282240  3060.127143  13.0  3818.0  5188.0   
Yes             13524.0  5917.770334  3043.427785  11.0  3891.0  5451.0   

                   75%     max  
PotentialFraud                  
No              8674.0  9999.0  
Yes             8659.0  9999.0  
No description has been provided for this image
Descriptive Statistics for ClmProcedureCode_2 by Potential Fraud:
                 count         mean          std   min     25%     50%  \
PotentialFraud                                                           
No              2244.0  4132.491979  2019.352638  42.0  2752.0  4019.0   
Yes             3246.0  4088.291436  2040.207424  42.0  2724.0  4019.0   

                   75%     max  
PotentialFraud                  
No              4439.0  9999.0  
Yes             4439.0  9999.0  
No description has been provided for this image
Descriptive Statistics for ClmProcedureCode_3 by Potential Fraud:
                count         mean          std    min     25%     50%  \
PotentialFraud                                                           
No              406.0  4176.603448  2298.686770  185.0  2724.0  4019.0   
Yes             563.0  4253.229130  2271.136731   42.0  2724.0  4019.0   

                   75%     max  
PotentialFraud                  
No              4547.0  9982.0  
Yes             5234.0  9999.0  
No description has been provided for this image
Descriptive Statistics for ClmProcedureCode_4 by Potential Fraud:
                count         mean          std    min      25%     50%  \
PotentialFraud                                                            
No               42.0  4165.285714  2382.664877   42.0  2730.25  4019.0   
Yes              76.0  4017.750000  1834.577155  185.0  2761.75  4019.0   

                   75%     max  
PotentialFraud                  
No              4998.5  9986.0  
Yes             4439.0  9974.0  
No description has been provided for this image
Descriptive Statistics for ClmProcedureCode_5 by Potential Fraud:
                count         mean          std     min     25%     50%  \
PotentialFraud                                                            
No                3.0  3761.666667   909.717722  2724.0  3431.5  4139.0   
Yes               6.0  6023.333333  3160.536073  2724.0  4139.0  4662.0   

                   75%     max  
PotentialFraud                  
No              4280.5  4422.0  
Yes             8774.5  9982.0  
Column 'ClmProcedureCode_6' has too few unique values or all NaNs. Skipping boxplot.
No description has been provided for this image
Descriptive Statistics for Gender by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.579017  0.493718  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.578549  0.493793  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for Race by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.241223  0.689281  1.0  1.0  1.0  1.0  5.0
Yes             212796.0  1.277392  0.760395  1.0  1.0  1.0  1.0  5.0
No description has been provided for this image
Descriptive Statistics for State by Potential Fraud:
                   count       mean        std  min   25%   50%   75%   max
PotentialFraud                                                             
No              345415.0  25.946276  15.085346  1.0  13.0  25.0  38.0  54.0
Yes             212796.0  24.636483  15.331032  1.0  10.0  23.0  37.0  54.0
No description has been provided for this image
Descriptive Statistics for County by Potential Fraud:
                   count        mean         std  min    25%    50%    75%  \
PotentialFraud                                                               
No              345415.0  376.183579  271.528928  0.0  140.0  340.0  570.0   
Yes             212796.0  382.491419  254.586720  0.0  160.0  360.0  570.0   

                  max  
PotentialFraud         
No              999.0  
Yes             999.0  
No description has been provided for this image
Descriptive Statistics for NoOfMonths_PartACov by Potential Fraud:
                   count       mean       std  min   25%   50%   75%   max
PotentialFraud                                                            
No              345415.0  11.927965  0.911568  0.0  12.0  12.0  12.0  12.0
Yes             212796.0  11.937165  0.853015  0.0  12.0  12.0  12.0  12.0
No description has been provided for this image
Descriptive Statistics for NoOfMonths_PartBCov by Potential Fraud:
                   count       mean       std  min   25%   50%   75%   max
PotentialFraud                                                            
No              345415.0  11.937562  0.795000  0.0  12.0  12.0  12.0  12.0
Yes             212796.0  11.940732  0.770898  0.0  12.0  12.0  12.0  12.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_Alzheimer by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.600538  0.489788  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.594226  0.491042  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_Heartfailure by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.413063  0.492385  1.0  1.0  1.0  2.0  2.0
Yes             212796.0  1.403908  0.490681  1.0  1.0  1.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_KidneyDisease by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.594294  0.491029  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.577779  0.493915  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_Cancer by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.849668  0.357397  1.0  2.0  2.0  2.0  2.0
Yes             212796.0  1.846905  0.360080  1.0  2.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_ObstrPulmonary by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.691464  0.461890  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.679938  0.466501  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_Depression by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.565375  0.495708  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.564898  0.495772  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_Diabetes by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.296516  0.456722  1.0  1.0  1.0  2.0  2.0
Yes             212796.0  1.291505  0.454456  1.0  1.0  1.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_IschemicHeart by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.243467  0.429175  1.0  1.0  1.0  1.0  2.0
Yes             212796.0  1.236301  0.424810  1.0  1.0  1.0  1.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_Osteoporasis by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.682784  0.465393  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.681653  0.465836  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_rheumatoidarthritis by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.690645  0.462229  1.0  1.0  2.0  2.0  2.0
Yes             212796.0  1.685882  0.464165  1.0  1.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for ChronicCond_stroke by Potential Fraud:
                   count      mean       std  min  25%  50%  75%  max
PotentialFraud                                                       
No              345415.0  1.900453  0.299395  1.0  2.0  2.0  2.0  2.0
Yes             212796.0  1.894754  0.306871  1.0  2.0  2.0  2.0  2.0
No description has been provided for this image
Descriptive Statistics for IPAnnualReimbursementAmt by Potential Fraud:
                   count         mean           std     min  25%  50%     75%  \
PotentialFraud                                                                  
No              345415.0  4903.940535  11388.465739 -8000.0  0.0  0.0  5000.0   
Yes             212796.0  5753.945375  12386.869727  -500.0  0.0  0.0  7000.0   

                     max  
PotentialFraud            
No              155600.0  
Yes             161470.0  
No description has been provided for this image
Descriptive Statistics for IPAnnualDeductibleAmt by Potential Fraud:
                   count        mean          std  min  25%  50%     75%  \
PotentialFraud                                                             
No              345415.0  534.962431  1137.325961  0.0  0.0  0.0  1068.0   
Yes             212796.0  623.612559  1242.152668  0.0  0.0  0.0  1068.0   

                    max  
PotentialFraud           
No              38272.0  
Yes             38272.0  
No description has been provided for this image
Descriptive Statistics for OPAnnualReimbursementAmt by Potential Fraud:
                   count         mean          std   min    25%     50%  \
PotentialFraud                                                            
No              345415.0  2271.897486  3834.023483 -50.0  470.0  1170.0   
Yes             212796.0  2288.496870  3958.231143 -70.0  460.0  1160.0   

                   75%       max  
PotentialFraud                    
No              2600.0  102960.0  
Yes             2570.0  102960.0  
No description has been provided for this image
Descriptive Statistics for OPAnnualDeductibleAmt by Potential Fraud:
                   count        mean          std  min    25%    50%    75%  \
PotentialFraud                                                                
No              345415.0  647.402655   990.644469  0.0  120.0  340.0  790.0   
Yes             212796.0  653.425807  1020.208428  0.0  120.0  340.0  800.0   

                    max  
PotentialFraud           
No              13840.0  
Yes             13840.0  
In [24]:
categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()

# Remove identifier and date columns (some date columns might still be here if conversion failed)
categorical_cols = [col for col in categorical_cols if col not in ['BeneID', 'ClaimID', 'Provider',
                                                                       'ClaimStartDt', 'ClaimEndDt',
                                                                       'AdmissionDt', 'DischargeDt', 'DOB', 'DOD',
                                                                       'AttendingPhysician', 'OperatingPhysician',
                                                                       'OtherPhysician', 'ClmAdmitDiagnosisCode',
                                                                       'DiagnosisGroupCode',
                                                                       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
                                                                       'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4',
                                                                       'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
                                                                       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8',
                                                                       'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10',
                                                                       'ClmProcedureCode_1', 'ClmProcedureCode_2',
                                                                       'ClmProcedureCode_3', 'ClmProcedureCode_4',
                                                                       'ClmProcedureCode_5', 'ClmProcedureCode_6']]

for col in categorical_cols:
    if col in train_df.columns:
        plt.figure(figsize=(10, 6))
        # Check the number of unique values to avoid overcrowding the plot
        unique_values = train_df[col].nunique()
        if unique_values > 50:  # it will Adjust this threshold as needed
            print(f"Column '{col}' has {unique_values} unique values. Skipping countplot for visualization.")
            # Calculate and print fraud proportions instead--the most important
            fraud_proportions = train_df.groupby(col)['PotentialFraud'].value_counts(normalize=True).unstack()
            print("\nFraud Proportions:\n", fraud_proportions)
        else:
            sns.countplot(data=train_df, x=col, hue='PotentialFraud', order=train_df[col].value_counts().index)
            plt.title(f'Distribution of {col} by Potential Fraud')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            plt.show()

        print(f"\nValue Counts and Fraud Proportions for {col}:")
        print(train_df.groupby(col)['PotentialFraud'].value_counts(normalize=True))
    else:
        print(f"Column '{col}' not found in train_df.")
No description has been provided for this image
Value Counts and Fraud Proportions for RenalDiseaseIndicator:
RenalDiseaseIndicator  PotentialFraud
0                      No                0.620607
                       Yes               0.379393
Y                      No                0.611372
                       Yes               0.388628
Name: proportion, dtype: float64
No description has been provided for this image
Value Counts and Fraud Proportions for PotentialFraud:
PotentialFraud
No     1.0
Yes    1.0
Name: proportion, dtype: float64
In [25]:
categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()
categorical_cols = [col for col in categorical_cols if col not in ['BeneID', 'ClaimID', 'Provider',
                                                                       'ClaimStartDt', 'ClaimEndDt',
                                                                       'AdmissionDt', 'DischargeDt', 'DOB', 'DOD',
                                                                       'AttendingPhysician', 'OperatingPhysician',
                                                                       'OtherPhysician', 'ClmAdmitDiagnosisCode',
                                                                       'DiagnosisGroupCode',
                                                                       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
                                                                       'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4',
                                                                       'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
                                                                       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8',
                                                                       'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10',
                                                                       'ClmProcedureCode_1', 'ClmProcedureCode_2',
                                                                       'ClmProcedureCode_3', 'ClmProcedureCode_4',
                                                                       'ClmProcedureCode_5', 'ClmProcedureCode_6']]

for col in categorical_cols:
    if col in train_df.columns:
        print(f"\n--- Analysis of Categorical Feature: {col} ---")
        unique_values = train_df[col].nunique()

        # 1. Check Cardinality
        if unique_values > 50:  # it will Adjust threshold if needed
            print(f"Column '{col}' has high cardinality ({unique_values} unique values). Displaying fraud proportions.")
            fraud_proportions = train_df.groupby(col)['PotentialFraud'].value_counts(normalize=True).unstack()
            print(fraud_proportions)


            top_n = 10
            top_categories = train_df[col].value_counts().head(top_n).index
            top_categories_df = train_df[train_df[col].isin(top_categories)]
            print(f"\nTop {top_n} Categories Fraud Proportions:")
            print(top_categories_df.groupby(col)['PotentialFraud'].value_counts(normalize=True).unstack())
        else:

            plt.figure(figsize=(10, 6))
            sns.countplot(data=train_df, x=col, hue='PotentialFraud', order=train_df[col].value_counts().index)
            plt.title(f'Distribution of {col} by Potential Fraud')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            plt.show()


        print("\nOverall Fraud Proportions:")
        print(train_df.groupby(col)['PotentialFraud'].value_counts(normalize=True))

    else:
        print(f"Column '{col}' not found in train_df.")
--- Analysis of Categorical Feature: RenalDiseaseIndicator ---
No description has been provided for this image
Overall Fraud Proportions:
RenalDiseaseIndicator  PotentialFraud
0                      No                0.620607
                       Yes               0.379393
Y                      No                0.611372
                       Yes               0.388628
Name: proportion, dtype: float64

--- Analysis of Categorical Feature: PotentialFraud ---
No description has been provided for this image
Overall Fraud Proportions:
PotentialFraud
No     1.0
Yes    1.0
Name: proportion, dtype: float64
In [26]:
date_cols = ['ClaimStartDt', 'ClaimEndDt', 'DOB', 'DOD']

if 'AdmissionDt' in train_df.columns:
    date_cols.append('AdmissionDt')
if 'DischargeDt' in train_df.columns:
    date_cols.append('DischargeDt')

for col in date_cols:
    if col in train_df.columns:
        print(f"\n--- Analysis of Date Feature: {col} ---")

        try:
            train_df[col] = pd.to_datetime(train_df[col], errors='coerce')
            print(f"Converted '{col}' to datetime.")
        except ValueError as e:
            print(f"Column '{col}' cannot be converted to datetime: {e}. Skipping analysis.")
            continue


        print("\nBasic Statistics:")
        print(train_df[col].describe())


        if 'ClaimStartDt' in train_df.columns and 'ClaimEndDt' in train_df.columns:  # More robust check
            if pd.api.types.is_datetime64_any_dtype(train_df['ClaimStartDt']) and pd.api.types.is_datetime64_any_dtype(train_df['ClaimEndDt']):
                train_df['ClaimDuration'] = (train_df['ClaimEndDt'] - train_df['ClaimStartDt']).dt.days
                print(f"\nClaim Duration Statistics:")
                print(train_df['ClaimDuration'].describe())

                plt.figure(figsize=(8, 5))
                sns.histplot(train_df['ClaimDuration'].dropna(), kde=True)
                plt.title('Distribution of Claim Duration')
                plt.show()

                plt.figure(figsize=(8, 5))
                sns.boxplot(x='PotentialFraud', y='ClaimDuration', data=train_df)
                plt.title('Claim Duration by Potential Fraud')
                plt.show()
            else:
                print("ClaimStartDt or ClaimEndDt are not proper datetime columns. Skipping ClaimDuration calculation.")


        if col == 'DOB' and 'DOB' in train_df.columns:  # Robust check
            train_df['Age'] = ((pd.to_datetime('2009-12-31') - train_df['DOB']).dt.days / 365.25).astype(int)
            print("\nAge Statistics:")
            print(train_df['Age'].describe())

            plt.figure(figsize=(8, 5))
            sns.histplot(train_df['Age'].dropna(), kde=True)
            plt.title('Distribution of Age')
            plt.show()

            plt.figure(figsize=(8, 5))
            sns.boxplot(x='PotentialFraud', y='Age', data=train_df)
            plt.title('Age by Potential Fraud')
            plt.show()


        if col == 'DOD' and 'DOD' in train_df.columns:  # Robust check
            train_df['DOD_Flag'] = train_df['DOD'].notna().astype(int)
            print("\nDistribution of DOD_Flag:")
            print(train_df['DOD_Flag'].value_counts(normalize=True))
            print("\nFraud Proportions by DOD_Flag:")
            print(train_df.groupby('DOD_Flag')['PotentialFraud'].value_counts(normalize=True))

    else:
        print(f"Column '{col}' not found in train_df.")
--- Analysis of Date Feature: ClaimStartDt ---
Converted 'ClaimStartDt' to datetime.

Basic Statistics:
count                           558211
mean     2009-06-24 23:39:21.603766528
min                2008-11-27 00:00:00
25%                2009-03-27 00:00:00
50%                2009-06-23 00:00:00
75%                2009-09-22 00:00:00
max                2009-12-31 00:00:00
Name: ClaimStartDt, dtype: object
ClaimStartDt or ClaimEndDt are not proper datetime columns. Skipping ClaimDuration calculation.

--- Analysis of Date Feature: ClaimEndDt ---
Converted 'ClaimEndDt' to datetime.

Basic Statistics:
count                           558211
mean     2009-06-26 17:07:35.601913600
min                2008-12-28 00:00:00
25%                2009-03-29 00:00:00
50%                2009-06-24 00:00:00
75%                2009-09-23 00:00:00
max                2009-12-31 00:00:00
Name: ClaimEndDt, dtype: object

Claim Duration Statistics:
count    558211.000000
mean          1.727940
std           4.904984
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          36.000000
Name: ClaimDuration, dtype: float64
No description has been provided for this image
No description has been provided for this image
--- Analysis of Date Feature: DOB ---
Converted 'DOB' to datetime.

Basic Statistics:
count                           558211
mean     1936-03-07 23:58:27.131890944
min                1909-01-01 00:00:00
25%                1927-07-01 00:00:00
50%                1935-05-01 00:00:00
75%                1941-12-01 00:00:00
max                1983-12-01 00:00:00
Name: DOB, dtype: object

Claim Duration Statistics:
count    558211.000000
mean          1.727940
std           4.904984
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          36.000000
Name: ClaimDuration, dtype: float64
No description has been provided for this image
No description has been provided for this image
Age Statistics:
count    558211.000000
mean         73.276820
std          13.011761
min          26.000000
25%          68.000000
50%          74.000000
75%          82.000000
max         100.000000
Name: Age, dtype: float64
No description has been provided for this image
No description has been provided for this image
--- Analysis of Date Feature: DOD ---
Converted 'DOD' to datetime.

Basic Statistics:
count                             4131
mean     2009-08-20 15:32:27.712418048
min                2009-02-01 00:00:00
25%                2009-07-01 00:00:00
50%                2009-09-01 00:00:00
75%                2009-11-01 00:00:00
max                2009-12-01 00:00:00
Name: DOD, dtype: object

Claim Duration Statistics:
count    558211.000000
mean          1.727940
std           4.904984
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          36.000000
Name: ClaimDuration, dtype: float64
No description has been provided for this image
No description has been provided for this image
Distribution of DOD_Flag:
DOD_Flag
0    0.9926
1    0.0074
Name: proportion, dtype: float64

Fraud Proportions by DOD_Flag:
DOD_Flag  PotentialFraud
0         No                0.618734
          Yes               0.381266
1         No                0.626241
          Yes               0.373759
Name: proportion, dtype: float64

--- Analysis of Date Feature: AdmissionDt ---
Converted 'AdmissionDt' to datetime.

Basic Statistics:
count                            40474
mean     2009-06-19 17:38:12.493946880
min                2008-11-27 00:00:00
25%                2009-03-20 00:00:00
50%                2009-06-16 00:00:00
75%                2009-09-17 00:00:00
max                2009-12-31 00:00:00
Name: AdmissionDt, dtype: object

Claim Duration Statistics:
count    558211.000000
mean          1.727940
std           4.904984
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          36.000000
Name: ClaimDuration, dtype: float64
No description has been provided for this image
No description has been provided for this image
--- Analysis of Date Feature: DischargeDt ---
Converted 'DischargeDt' to datetime.

Basic Statistics:
count                            40474
mean     2009-06-25 09:36:02.988585216
min                2009-01-01 00:00:00
25%                2009-03-26 00:00:00
50%                2009-06-22 00:00:00
75%                2009-09-23 00:00:00
max                2009-12-31 00:00:00
Name: DischargeDt, dtype: object

Claim Duration Statistics:
count    558211.000000
mean          1.727940
std           4.904984
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          36.000000
Name: ClaimDuration, dtype: float64
No description has been provided for this image
No description has been provided for this image
In [27]:
def create_date_features(df):
    """
    Creates various date-related features from the date columns.

    Args:
        df (pd.DataFrame): Input DataFrame containing date columns.

    Returns:
        pd.DataFrame: DataFrame with added date features.
    """

    if 'ClaimStartDt' in df.columns:
        df['ClaimStartDt'] = pd.to_datetime(df['ClaimStartDt'], errors='coerce')
        df['ClaimStart_Month'] = df['ClaimStartDt'].dt.month
        df['ClaimStart_Quarter'] = df['ClaimStartDt'].dt.quarter
        df['ClaimStart_Year'] = df['ClaimStartDt'].dt.year
        df['ClaimStart_DayOfWeek'] = df['ClaimStartDt'].dt.dayofweek  # Monday=0, Sunday=6

    if 'ClaimEndDt' in df.columns:
        df['ClaimEndDt'] = pd.to_datetime(df['ClaimEndDt'], errors='coerce')
        df['ClaimEnd_Month'] = df['ClaimEndDt'].dt.month
        df['ClaimEnd_Quarter'] = df['ClaimEndDt'].dt.quarter
        df['ClaimEnd_Year'] = df['ClaimEndDt'].dt.year
        df['ClaimEnd_DayOfWeek'] = df['ClaimEndDt'].dt.dayofweek

    if 'AdmissionDt' in df.columns and 'DischargeDt' in df.columns:
        df['AdmissionDt'] = pd.to_datetime(df['AdmissionDt'], errors='coerce')
        df['DischargeDt'] = pd.to_datetime(df['DischargeDt'], errors='coerce')
        df['LengthOfStay'] = (df['DischargeDt'] - df['AdmissionDt']).dt.days
        df['Admission_Month'] = df['AdmissionDt'].dt.month
        df['Admission_Quarter'] = df['AdmissionDt'].dt.quarter
        df['Admission_Year'] = df['AdmissionDt'].dt.year
        df['Admission_DayOfWeek'] = df['AdmissionDt'].dt.dayofweek
        df['Discharge_Month'] = df['DischargeDt'].dt.month
        df['Discharge_Quarter'] = df['DischargeDt'].dt.quarter
        df['Discharge_Year'] = df['DischargeDt'].dt.year
        df['Discharge_DayOfWeek'] = df['DischargeDt'].dt.dayofweek

    if 'DOB' in df.columns:
        df['DOB'] = pd.to_datetime(df['DOB'], errors='coerce')
        df['Age'] = ((pd.to_datetime('2009-12-31') - df['DOB']).dt.days / 365.25).astype(int)

    return df

train_df = create_date_features(train_df)
print("Shape after date feature engineering:", train_df.shape)
print(train_df[['ClaimStart_Month', 'ClaimStart_Year', 'LengthOfStay', 'Age']].head())
Shape after date feature engineering: (558211, 75)
   ClaimStart_Month  ClaimStart_Year  LengthOfStay  Age
0                 4             2009           6.0   66
1                 8             2009           2.0   66
2                 9             2009           3.0   66
3                 2             2009           8.0   95
4                 8             2009          17.0   71
In [28]:
def create_claim_code_features(df):
    """
    Creates features from diagnosis and procedure codes.

    Args:
        df (pd.DataFrame): Input DataFrame.

    Returns:
        pd.DataFrame: DataFrame with added claim code features.
    """

    diagnosis_cols = [col for col in df.columns if col.startswith('ClmDiagnosisCode_')]
    procedure_cols = [col for col in df.columns if col.startswith('ClmProcedureCode_')]

    # 1. Count of Diagnosis Codes
    df['DiagnosisCode_Count'] = df[diagnosis_cols].notna().sum(axis=1)

    # 2. Count of Procedure Codes
    df['ProcedureCode_Count'] = df[procedure_cols].notna().sum(axis=1)

    # 3. Principal Diagnosis Code (First one)
    if 'ClmDiagnosisCode_1' in df.columns:
        df['PrincipalDiagnosis'] = df['ClmDiagnosisCode_1'].str[:3]  # We have to keep only the first 3 characters

    # 4. Any Diagnosis Code Present Flags
    for col in diagnosis_cols:
      df[f'{col}_Flag'] = df[col].notna().astype(int)

    for col in procedure_cols:
      df[f'{col}_Flag'] = df[col].notna().astype(int)

    return df

train_df = create_claim_code_features(train_df)
print("Shape after claim code feature engineering:", train_df.shape)
print(train_df[['DiagnosisCode_Count', 'ProcedureCode_Count', 'PrincipalDiagnosis']].head())
Shape after claim code feature engineering: (558211, 94)
   DiagnosisCode_Count  ProcedureCode_Count PrincipalDiagnosis
0                    9                    0                197
1                    3                    1                618
2                    6                    0                296
3                    9                    1                434
4                    9                    1                042
In [29]:
import numpy as np  # Import numpy for Mathematical operations

def create_beneficiary_features(df):
    """
    Creates features from beneficiary-related information (optimized for speed).

    Args:
        df (pd.DataFrame): Input DataFrame.

    Returns:
        pd.DataFrame: DataFrame with added beneficiary features.
    """

    # 1. Chronic Condition Count (Optimized)
    chronic_cols = [col for col in df.columns if col.startswith('Chronic')]
    if chronic_cols:  # Check if there are any chronic condition columns
        df['ChronicCondition_Count'] = (df[chronic_cols] == 'Yes').sum(axis=1)


    if 'RenalDiseaseIndicator' in df.columns:
        df['RenalDiseaseIndicator_Flag'] = (df['RenalDiseaseIndicator'].str.lower() == 'yes').astype(int)
        df.drop(columns=['RenalDiseaseIndicator'], inplace=True, errors='ignore')


    if 'DOD_Flag' not in df.columns and 'DOD' in df.columns:
        df['DOD_Flag'] = df['DOD'].notna().astype(int)

    return df

train_df = create_beneficiary_features(train_df)
print("Shape after beneficiary feature engineering:", train_df.shape)
print(train_df[['ChronicCondition_Count', 'RenalDiseaseIndicator_Flag']].head())
Shape after beneficiary feature engineering: (558211, 95)
   ChronicCondition_Count  RenalDiseaseIndicator_Flag
0                       0                           0
1                       0                           0
2                       0                           0
3                       0                           0
4                       0                           0
In [30]:
def select_correlated_features(df, threshold=0.8):
    """
    Selects features by removing highly correlated ones.

    Args:
        df (pd.DataFrame): Input DataFrame.
        threshold (float): Correlation threshold.

    Returns:
        pd.DataFrame: DataFrame with selected features.
        list: List of dropped columns.
    """

    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
    numerical_cols = [col for col in numerical_cols if col not in ['PotentialFraud']]  # Exclude target

    corr_matrix = df[numerical_cols].corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if any(upper[col] > threshold)]

    selected_df = df.drop(columns=to_drop, errors='ignore')  # Add errors='ignore'
    return selected_df, to_drop

train_df_selected_corr, dropped_cols_corr = select_correlated_features(train_df)
print("Shape after correlation-based feature selection:", train_df_selected_corr.shape)
print("Dropped columns (correlation):", dropped_cols_corr)
Shape after correlation-based feature selection: (558211, 73)
Dropped columns (correlation): ['OPAnnualDeductibleAmt', 'ClaimDuration', 'ClaimStart_Quarter', 'ClaimEnd_Month', 'ClaimEnd_Quarter', 'ClaimEnd_DayOfWeek', 'LengthOfStay', 'Admission_Month', 'Admission_Quarter', 'Admission_Year', 'Admission_DayOfWeek', 'Discharge_Month', 'Discharge_Quarter', 'Discharge_DayOfWeek', 'ClmDiagnosisCode_4_Flag', 'ClmDiagnosisCode_5_Flag', 'ClmDiagnosisCode_6_Flag', 'ClmDiagnosisCode_7_Flag', 'ClmDiagnosisCode_8_Flag', 'ClmDiagnosisCode_9_Flag', 'ClmDiagnosisCode_10_Flag', 'ClmProcedureCode_1_Flag']
In [31]:
from sklearn.ensemble import RandomForestClassifier

def select_top_features_rf_optimized(df, target_col='PotentialFraud', top_n=20):
    """
    Selects the top N features based on Random Forest feature importance (optimized for speed).

    Args:
        df (pd.DataFrame): Input DataFrame.
        target_col (str): Name of the target column.
        top_n (int): Number of top features to select.

    Returns:
        pd.DataFrame: DataFrame with selected features.
        list: List of selected feature names.
    """

    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
    numerical_cols = [col for col in numerical_cols if col not in [target_col]]

    X = df[numerical_cols]
    y = df[target_col]

    model = RandomForestClassifier(
        n_estimators=50,  # Reduced number of trees
        max_depth=10,       # Limited tree depth
        max_features='sqrt', # Feature subsampling
        n_jobs=-1,          # Parallelization
        random_state=42,
        class_weight='balanced'
    )
    model.fit(X, y)

    importance = model.feature_importances_
    feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
    feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

    top_features = feature_importance['Feature'].head(top_n).tolist()
    selected_df = df[[target_col] + top_features]

    return selected_df, top_features

train_df_selected_rf_optimized, top_features_rf_optimized = select_top_features_rf_optimized(train_df)
print("Shape after Random Forest-based feature selection:", train_df_selected_rf_optimized.shape)
print("Top features (Random Forest):", top_features_rf_optimized)
Shape after Random Forest-based feature selection: (558211, 21)
Top features (Random Forest): ['State', 'County', 'Admission_Quarter', 'LengthOfStay', 'DeductibleAmtPaid', 'InscClaimAmtReimbursed', 'Discharge_Month', 'Race', 'Discharge_DayOfWeek', 'Admission_Year', 'Admission_DayOfWeek', 'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'Admission_Month', 'Age', 'IPAnnualReimbursementAmt', 'Discharge_Quarter', 'ClaimDuration', 'IPAnnualDeductibleAmt', 'ClmProcedureCode_1_Flag']
In [32]:
def prepare_data_for_modeling(df, target_col='PotentialFraud'):
    """
    Prepares the data for modeling by handling missing values and encoding categorical features.

    Args:
        df (pd.DataFrame): Input DataFrame.
        target_col (str): Name of the target column.

    Returns:
        pd.DataFrame: Prepared DataFrame.
        list: List of categorical columns that were encoded.
    """

    df = df.copy()  # Work on a copy to avoid modifying the original DataFrame

    # 1. Handle Missing Values
    for col in df.columns:
        if df[col].isnull().any():
            if pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].fillna(df[col].median())  # Impute numerical with median(avg)
            else:
                df[col] = df[col].fillna(df[col].mode()[0])  # Impute categorical with mode

    # 2. Encode Categorical Features
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    categorical_cols = [col for col in categorical_cols if col != target_col]  # Exclude target

    encoded_cols = []
    if categorical_cols:
        df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
        encoded_cols = [col for col in df.columns if any(part in col for part in categorical_cols)]

    return df, encoded_cols

train_df_prepared, encoded_cols = prepare_data_for_modeling(train_df_selected_rf_optimized)
print("Shape after data preparation:", train_df_prepared.shape)
print("Encoded columns:", encoded_cols)
print("\nPrepared data head:")
print(train_df_prepared.head())
Shape after data preparation: (558211, 21)
Encoded columns: []

Prepared data head:
  PotentialFraud  State  County  Admission_Quarter  LengthOfStay  \
0            Yes     39     230                2.0           6.0   
1             No     39     230                3.0           2.0   
2             No     39     230                3.0           3.0   
3             No      1     360                1.0           8.0   
4             No     45     780                3.0          17.0   

   DeductibleAmtPaid  InscClaimAmtReimbursed  Discharge_Month  Race  \
0             1068.0                   26000              4.0     1   
1             1068.0                    5000              9.0     1   
2             1068.0                    5000              9.0     1   
3             1068.0                    5000              2.0     2   
4             1068.0                   10000              8.0     1   

   Discharge_DayOfWeek  ...  Admission_DayOfWeek  OPAnnualReimbursementAmt  \
0                  5.0  ...                  6.0                        60   
1                  2.0  ...                  0.0                        60   
2                  6.0  ...                  3.0                        60   
3                  6.0  ...                  5.0                       250   
4                  6.0  ...                  3.0                       120   

   OPAnnualDeductibleAmt  Admission_Month  Age  IPAnnualReimbursementAmt  \
0                     70              4.0   66                     36000   
1                     70              8.0   66                     36000   
2                     70              9.0   66                     36000   
3                    320              2.0   95                      5000   
4                    100              8.0   71                     21260   

   Discharge_Quarter  ClaimDuration  IPAnnualDeductibleAmt  \
0                2.0              6                   3204   
1                3.0              2                   3204   
2                3.0              3                   3204   
3                1.0              8                   1068   
4                3.0             17                   2136   

   ClmProcedureCode_1_Flag  
0                        0  
1                        1  
2                        0  
3                        1  
4                        1  

[5 rows x 21 columns]
In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

data = {'Feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'Feature2': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
        'PotentialFraud': [0, 0, 0, 0, 1, 1, 1, 0, 0, 1],
        'Provider': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']}
final_df = pd.DataFrame(data)

X = final_df.drop(columns=['Provider', 'PotentialFraud'])
y = final_df['PotentialFraud']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train model
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

In [34]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc

# 1. AUC-ROC Curve

# Get predicted probabilities for the positive class (PotentialFraud = 1)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Calculate ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc_score = roc_auc_score(y_test, y_pred_proba)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line (random classifier)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()

print(f'AUC Score: {auc_score}')
# These are the most crucial steps
# 2. Precision-Recall Curve

# Calculate Precision-Recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
auc_pr = auc(recall, precision)

# Plot Precision-Recall curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'AUPRC = {auc_pr:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()

print(f'AUPRC Score: {auc_pr}')
No description has been provided for this image
AUC Score: 1.0
No description has been provided for this image
AUPRC Score: 1.0
In [35]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf_random = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
    scoring='roc_auc'
)

# Fit the RandomizedSearchCV
rf_random.fit(X_train, y_train)

# Best hyperparameters
print("Best Hyperparameters:", rf_random.best_params_)

# Best model
best_rf_model = rf_random.best_estimator_

# Evaluate the best model (you can reuse the evaluation code from previous steps)
y_pred = best_rf_model.predict(X_test)
print(classification_report(y_test, y_pred))
y_pred_proba = best_rf_model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score: {auc_score}")
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

AUC Score: 1.0
In [36]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Logistic Regression
logreg_model = LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear')
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)
print("Logistic Regression:\n", classification_report(y_test, y_pred_logreg))

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]))  # scale_pos_weight handles imbalance
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print("XGBoost:\n", classification_report(y_test, y_pred_xgb))
Logistic Regression:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

XGBoost:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/usr/local/lib/python3.11/dist-packages/xgboost/core.py:158: UserWarning: [13:17:59] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

inpatient_df = pd.read_csv('/content/Train_Inpatientdata-1542865627584.csv')
outpatient_df = pd.read_csv('/content/Train_Outpatientdata-1542865627584.csv')
beneficiary_df = pd.read_csv('/content/Train_Beneficiarydata-1542865627584.csv')
labels_df = pd.read_csv('/content/Train-1542865627584.csv')

def create_date_features(df):
    for col in ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
            df[col + '_year'] = df[col].dt.year
            df[col + '_month'] = df[col].dt.month
            df[col + '_day'] = df[col].dt.day
            df[col + '_dayofweek'] = df[col].dt.dayofweek
    if 'AdmissionDt' in df.columns and 'DischargeDt' in df.columns:
        df['LengthOfStay'] = (df['DischargeDt'] - df['AdmissionDt']).dt.days
        df['AdmitForDays'] = df['LengthOfStay']
    return df

def create_claim_code_features(df):
    for prefix in ['ClmDiagnosisCode_', 'ClmProcedureCode_']:
        cols = [col for col in df.columns if col.startswith(prefix)]
        if cols:
            df[prefix + 'count'] = df[cols].count(axis=1)
    return df

def create_beneficiary_features(df):
    for prefix in ['ChronicCond_']:
        cols = [col for col in df.columns if col.startswith(prefix)]
        if cols:
            df[prefix + 'count'] = (df[cols] == 'Yes').sum(axis=1)
    if 'RenalDiseaseIndicator' in df.columns:
        df['RenalDiseaseIndicator'] = df['RenalDiseaseIndicator'].str.lower() == 'yes'
    return df

def create_age_bins(df):
    if 'Age' in df.columns:
        df['Age_bins'] = pd.cut(df['Age'], bins=[0, 65, 75, float('inf')], labels=[0, 1, 2])
    return df

# Preprocess dataframes
inpatient_df = create_date_features(inpatient_df)
outpatient_df = create_date_features(outpatient_df)
inpatient_df = create_claim_code_features(inpatient_df)
outpatient_df = create_claim_code_features(outpatient_df)
beneficiary_df = create_beneficiary_features(beneficiary_df)
beneficiary_df = create_age_bins(beneficiary_df)

# Concatenate both inpatient and outpatient dataframes
claims_df = pd.concat([inpatient_df, outpatient_df], ignore_index=True)

# Merge claims with beneficiary data using BeneID
print("claims_df columns before merge:", claims_df.columns)
print("beneficiary_df columns before merge:", beneficiary_df.columns)

claims_merged = claims_df.merge(beneficiary_df, on='BeneID', how='left')

print("claims_merged columns after merge:", claims_merged.columns)

# Aggregate features by Provider
def aggregate_provider_features(df):
    agg_funcs = {
        'ClaimID': 'count',
        'InscClaimAmtReimbursed': 'sum',
        'DeductibleAmtPaid': 'sum',
        'AdmitForDays': 'sum',
        'DiagnosisGroupCode': 'nunique',
        'ClmProcedureCode_count': 'sum',
        'ClmDiagnosisCode_count': 'sum',
        'LengthOfStay': 'sum',
        'ChronicCond_count': 'sum',
        'Age_bins': 'sum'
    }
    # Check if columns exist before aggregating
    cols_to_agg = {col: agg_funcs[col] for col in agg_funcs if col in df.columns}
    if not cols_to_agg:
        return pd.DataFrame()  # Return empty DataFrame if no columns to aggregate
    agg_df = df.groupby('Provider').agg(cols_to_agg)
    agg_df.columns = ['Provider_' + col if col != 'Provider' else col for col in agg_df.columns]
    if 'InscClaimAmtReimbursed' in df.columns:
        agg_df['Provider_Avg_InscClaimAmtReimbursed'] = df.groupby('Provider')['InscClaimAmtReimbursed'].mean()
    if 'DeductibleAmtPaid' in df.columns:
        agg_df['Provider_Avg_DeductibleAmtPaid'] = df.groupby('Provider')['DeductibleAmtPaid'].mean()
    if 'AdmitForDays' in df.columns:
        agg_df['Provider_Avg_AdmitForDays'] = df.groupby('Provider')['AdmitForDays'].mean()
    return agg_df.reset_index()

provider_features = aggregate_provider_features(claims_merged)

final_df = provider_features.merge(labels_df, on='Provider', how='left')
final_df.drop(columns=['Provider'], inplace=True)

final_df['PotentialFraud'] = final_df['PotentialFraud'].map({'Yes': 1, 'No': 0})

X = final_df.drop(columns=['PotentialFraud'])
y = final_df['PotentialFraud'].astype(int)

numeric_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Impute missing values ---
imputer = SimpleImputer(strategy='mean')  # Or 'median', 'most_frequent', etc.

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                       ('imputer', imputer)])

X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

# --- Models ---
# Logistic Regression
logreg_model = LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear')
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)
y_proba_logreg = logreg_model.predict_proba(X_test)[:, 1]  # Probabilities for class 1
print("Logistic Regression:\n", classification_report(y_test, y_pred_logreg))

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]))
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]  # Probabilities for class 1
print("XGBoost:\n", classification_report(y_test, y_pred_xgb))

# --- Prediction on Unseen Data and CSV Creation ---

unseen_inpatient_df = pd.read_csv('/content/Unseen_Inpatientdata-1542969243754.csv')
unseen_outpatient_df = pd.read_csv('/content/Unseen_Outpatientdata-1542969243754.csv')
unseen_beneficiary_df = pd.read_csv('/content/Unseen_Beneficiarydata-1542969243754.csv')
unseen_labels_df = pd.read_csv('/content/Unseen-1542969243754.csv')

unseen_inpatient_df = create_date_features(unseen_inpatient_df)
unseen_outpatient_df = create_date_features(unseen_outpatient_df)
unseen_inpatient_df = create_claim_code_features(unseen_inpatient_df)
unseen_outpatient_df = create_claim_code_features(unseen_outpatient_df)
unseen_beneficiary_df = create_beneficiary_features(unseen_beneficiary_df)
unseen_beneficiary_df = create_age_bins(unseen_beneficiary_df)

unseen_claims_df = pd.concat([unseen_inpatient_df, unseen_outpatient_df], ignore_index=True)
unseen_claims_merged = unseen_claims_df.merge(unseen_beneficiary_df, on='BeneID', how='left')
unseen_provider_features = aggregate_provider_features(unseen_claims_merged)

# Align columns - VERY IMPORTANT
train_cols = X.columns
unseen_provider_features = unseen_provider_features.reindex(columns=train_cols, fill_value=0) # Use fill_value=0 for missing cols

unseen_processed = pipeline.transform(unseen_provider_features)

unseen_probabilities = xgb_model.predict_proba(unseen_processed)[:, 1]
unseen_predictions = xgb_model.predict(unseen_processed)

submission_df = pd.DataFrame({
    'Provider': unseen_labels_df['Provider'],
    'Probability': unseen_probabilities,
    'PredictedClass': unseen_predictions
})

submission_df.to_csv('Vedant Avinash Patil_Submission.csv', index=False)

from google.colab import files
files.download('Vedant Avinash Patil_Submission.csv')

print("Submission file created: Vedant Avinash Patil_Submission.csv")
claims_df columns before merge: Index(['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
       'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician',
       'OtherPhysician', 'AdmissionDt', 'ClmAdmitDiagnosisCode',
       'DeductibleAmtPaid', 'DischargeDt', 'DiagnosisGroupCode',
       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
       'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
       'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
       'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
       'ClmProcedureCode_6', 'ClaimStartDt_year', 'ClaimStartDt_month',
       'ClaimStartDt_day', 'ClaimStartDt_dayofweek', 'ClaimEndDt_year',
       'ClaimEndDt_month', 'ClaimEndDt_day', 'ClaimEndDt_dayofweek',
       'AdmissionDt_year', 'AdmissionDt_month', 'AdmissionDt_day',
       'AdmissionDt_dayofweek', 'DischargeDt_year', 'DischargeDt_month',
       'DischargeDt_day', 'DischargeDt_dayofweek', 'LengthOfStay',
       'AdmitForDays', 'ClmDiagnosisCode_count', 'ClmProcedureCode_count'],
      dtype='object')
beneficiary_df columns before merge: Index(['BeneID', 'DOB', 'DOD', 'Gender', 'Race', 'RenalDiseaseIndicator',
       'State', 'County', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'ChronicCond_count'],
      dtype='object')
claims_merged columns after merge: Index(['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
       'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician',
       'OtherPhysician', 'AdmissionDt', 'ClmAdmitDiagnosisCode',
       'DeductibleAmtPaid', 'DischargeDt', 'DiagnosisGroupCode',
       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
       'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
       'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
       'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
       'ClmProcedureCode_6', 'ClaimStartDt_year', 'ClaimStartDt_month',
       'ClaimStartDt_day', 'ClaimStartDt_dayofweek', 'ClaimEndDt_year',
       'ClaimEndDt_month', 'ClaimEndDt_day', 'ClaimEndDt_dayofweek',
       'AdmissionDt_year', 'AdmissionDt_month', 'AdmissionDt_day',
       'AdmissionDt_dayofweek', 'DischargeDt_year', 'DischargeDt_month',
       'DischargeDt_day', 'DischargeDt_dayofweek', 'LengthOfStay',
       'AdmitForDays', 'ClmDiagnosisCode_count', 'ClmProcedureCode_count',
       'DOB', 'DOD', 'Gender', 'Race', 'RenalDiseaseIndicator', 'State',
       'County', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'ChronicCond_count'],
      dtype='object')
Logistic Regression:
               precision    recall  f1-score   support

           0       0.98      0.88      0.93       981
           1       0.43      0.86      0.58       101

    accuracy                           0.88      1082
   macro avg       0.71      0.87      0.75      1082
weighted avg       0.93      0.88      0.90      1082

/usr/local/lib/python3.11/dist-packages/xgboost/core.py:158: UserWarning: [13:18:06] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
XGBoost:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95       981
           1       0.54      0.63      0.58       101

    accuracy                           0.92      1082
   macro avg       0.75      0.79      0.77      1082
weighted avg       0.92      0.92      0.92      1082

Submission file created: Vedant Avinash Patil_Submission.csv
In [ ]: